We want to look at real datasets to simulate realistic datasets.

data("allen")

prefilter <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
                    which(colData(allen)$Core.Type=="Core")]
filterGenes <- apply(assay(prefilter) > 10, 1, sum) >= 10
sampleCells <- sample(ncol(prefilter), 100)

postfilter <- prefilter[filterGenes, sampleCells]
bio <- as.factor(colData(postfilter)$driver_1_s)
cl <- as.factor(colData(postfilter)$Primary.Type)
postfilter <- assay(postfilter)

vars <- rowVars(log1p(postfilter))
names(vars) <- rownames(postfilter)
vars <- sort(vars, decreasing = TRUE)
core <- postfilter[names(vars)[1:1000],]
detection_rate <- colSums(core>0)
coverage <- colSums(core)

We will color-code the cells by either known cell type or by inferred cluster (inferred in the original study). Remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells, sample at random 100 cells, select 1,000 most variable genes. Then, dimensions are

dim(core)
## [1] 1000  100
par(mfrow = c(1,2))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, rowMeans(core == 0), xlim = c(3,8), ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              main = '1,000 most variable genes')
smoothScatter(mean, rowMeans(core == 0), xlim = c(3,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = '1,000 most variable genes')

fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts")

#legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts")

Fit data with K = 2, V and X intercepts in both Mu and Pi, commondispersion = FALSE, and no covariate.

print(system.time(zinb <- zinbFit(core, ncores = 3, K = 2,
                                  commondispersion = FALSE)))
##    user  system elapsed 
## 268.035 143.760 162.351

True W

Our true W is

par(mfrow=c(1, 2))
plot(zinb@W, col=cols[bio], pch=19, xlab="W1", ylab="W2",
     main="color = known cell-type")
#legend("bottomright", levels(bio), fill=cols)

plot(zinb@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2",
     main="color = inferred clusters")

#legend("topright", levels(cl), fill=cols2)
true_W <- zinb@W

Gamma

Gamma_mu and gamma_pi are not correlated in this dataset. Let’s simulate \[\gamma_{\mu} \sim \mathcal{N}(5, .5)\] \[\gamma_{\pi} \sim \mathcal{N}(-1, 2.5)\]

gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]

c(mean = mean(gamma_mu), sd = sd(gamma_mu))
##      mean        sd 
## 4.9465718 0.3887733
c(mean = mean(gamma_pi), sd = sd(gamma_pi))
##       mean         sd 
## -0.8150281  0.8255717
true_gamma_mu = rnorm(100, 5, 0.4)
true_gamma_pi = rnorm(100, -.8, .8)

true_gamma = data.frame(true_gamma_mu = true_gamma_mu,
                        true_gamma_pi = true_gamma_pi, 
                        celltype = bio)
true_gamma = melt(true_gamma)

df <- data.frame(gamma_mu = gamma_mu,
                 gamma_pi = gamma_pi,
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.1824662      0.2365889  0.8958176
## gamma_pi       -0.1824662  1.0000000     -0.9914158 -0.4572457
## detection_rate  0.2365889 -0.9914158      1.0000000  0.5103829
## coverage        0.8958176 -0.4572457      0.5103829  1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi,
                   true_gamma_mu = true_gamma_mu, true_gamma_pi = true_gamma_pi,
                   celltype = bio)
gamma = melt(gamma)
gamma$status = sapply(gamma$variable, function(x) ifelse(grepl('true', x), 'simulated', 'fitted'))
gamma$variable = gsub('true_', '', gamma$variable)

ggplot(gamma, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 20, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(status ~ variable, scale = 'free_x') + xlab('gamma')

par(mfrow=c(1,2))
qqplot(gamma_mu, true_gamma_mu, main = 'QQplot - gamma_mu')
abline(a=0,b=1)
qqplot(gamma_pi, true_gamma_pi, main = 'QQplot - gamma_pi')
abline(a=0,b=1)

ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
  geom_boxplot() + coord_flip() +
  facet_grid(status~ variable, scales = 'free_x') +
  theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

ggplot(gamma, aes(value, fill = celltype)) + geom_density(alpha = 0.2) +
  facet_grid(status ~ variable) + xlab('gamma') + theme_bw()

Beta

rmixtNorm = function(n, m1, sd1, m2, sd2, p = .95, seed = NULL){
  if (!is.null(seed)) set.seed(seed)
  temp = cbind(rnorm(n, m1, sd1), rnorm(n, m2, sd2))
  id = sample(1:2, n, rep = T, prob = c(p,1-p))
  id = cbind(1:n,id)
  temp[id]
}

beta_mu and beta_pi are not correlated in the dataset. To simulate the long tail of the distributions of beta, let’s simulate beta_mu and beta_pi as a mixture of gaussians \[\beta_{\mu} \sim .9 \mathcal{N}(.8, .2) + .1 \mathcal{N}(0, 1)\] \[\beta_{\pi} \sim .9 \mathcal{N}(-1, 5) + .1 \mathcal{N}(-25, 10)\]

beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]

c(mean = mean(beta_mu), sd = sd(beta_mu))
##      mean        sd 
## 0.9923809 0.4641701
c(mean = mean(beta_pi), sd = sd(beta_pi))
##      mean        sd 
## -1.700859  6.031535
true_beta_mu = rmixtNorm(1000, .8, .4, 0, 1, seed = 123)
true_beta_pi = rmixtNorm(1000, -.4, .8, -25, 20, seed = 456)

df <- data.frame(beta_mu = beta_mu,
                 beta_pi = beta_pi)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##            beta_mu    beta_pi
## beta_mu  1.0000000 -0.2587559
## beta_pi -0.2587559  1.0000000
true_beta = data.frame(true_beta_mu = true_beta_mu, true_beta_pi = true_beta_pi)
pairs(true_beta, pch=19)

print(cor(df, method="spearman"))
##            beta_mu    beta_pi
## beta_mu  1.0000000 -0.2587559
## beta_pi -0.2587559  1.0000000
beta = data.frame(beta_mu = beta_mu, beta_pi = beta_pi, 
                  true_beta_mu = true_beta_mu, 
                  true_beta_pi = true_beta_pi)
beta = melt(beta)
beta$status = sapply(beta$variable, function(x) ifelse(grepl('true', x), 'simulated', 'fitted'))
beta$variable = gsub('true_', '', beta$variable)

ggplot(beta, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 50, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(status ~ variable, scales = 'free') + xlab('beta')

par(mfrow=c(1,2))
qqplot(beta_mu, true_beta_mu, main = 'QQplot - beta_mu')
abline(a=0,b=1)
qqplot(beta_pi, true_beta_pi, main = 'QQplot - beta_pi')
abline(a=0,b=1)

ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot() + facet_grid(status ~ variable) +
  coord_flip() +
  theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot(outlier.shape = NA) +
  facet_grid(status ~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta removing outliers') +
  scale_x_discrete(breaks = c('', ''), drop=FALSE) +
  scale_y_continuous(limits = c(min, max))
## Warning: Removed 1919 rows containing non-finite values (stat_boxplot).

Alpha

alpha_mu and alpha_pi are not correlated.

par(mfrow=c(1,2))
plot(t(zinb@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")

alpha_mu_1 = zinb@alpha_mu[1, ]
alpha_mu_2 = zinb@alpha_mu[2, ]
alpha_pi_1 = zinb@alpha_pi[1, ]
alpha_pi_2 = zinb@alpha_pi[2, ]

df <- data.frame(alpha_mu_1 = alpha_mu_1,
                 alpha_mu_2 = alpha_mu_2,
                 alpha_pi_1 = alpha_pi_1,
                 alpha_pi_2 = alpha_pi_2)
pairs(df, pch=19, main = 'fitted alpha')

print(cor(df, method="spearman"))
##             alpha_mu_1  alpha_mu_2  alpha_pi_1  alpha_pi_2
## alpha_mu_1  1.00000000  0.10127903 -0.39681655 -0.08947395
## alpha_mu_2  0.10127903  1.00000000 -0.06465126 -0.26197093
## alpha_pi_1 -0.39681655 -0.06465126  1.00000000 -0.02167753
## alpha_pi_2 -0.08947395 -0.26197093 -0.02167753  1.00000000

alpha_mu and alpha_pi are not correlated in this dataset. Let’s simulate \[\alpha_{\mu1} \sim .97 \mathcal{N}(0, 0.05) + .03 \mathcal{N}(0, 1)\] \[\alpha_{\mu2} \sim .97 \mathcal{N}(0, 0.1) + .03 \mathcal{N}(0, 1)\] \[\alpha_{\pi1} \sim .97 \mathcal{N}(0, 0.5) + .03 \mathcal{N}(0, 10)\] \[\alpha_{\pi2} \sim .97 \mathcal{N}(0, 0.5) + .03 \mathcal{N}(0, 10)\]

c(mean = mean(alpha_mu_1), sd = sd(alpha_mu_1))
##        mean          sd 
## -0.07283686  0.18511472
c(mean = mean(alpha_mu_2), sd = sd(alpha_mu_2))
##       mean         sd 
## 0.02986855 0.21500970
c(mean = mean(alpha_pi_1), sd = sd(alpha_pi_1))
##       mean         sd 
## -0.3420094  2.3231649
c(mean = mean(alpha_pi_2), sd = sd(alpha_pi_2))
##       mean         sd 
## 0.04426955 1.57533333
true_alpha_mu_1 = rmixtNorm(1000, 0, 0.05, 0, 1, p =.97)
true_alpha_mu_2 = rmixtNorm(1000, 0, 0.1, 0, 1, p =.97)
true_alpha_pi_1 = rmixtNorm(1000, 0, .5, 0, 10, p =.97)
true_alpha_pi_2 = rmixtNorm(1000, 0, .5, 0, 10, p =.97)

df <- data.frame(true_alpha_mu_1 = true_alpha_mu_1,
                 true_alpha_mu_2 = true_alpha_mu_2,
                 true_alpha_pi_1 = true_alpha_pi_1,
                 true_alpha_pi_2 = true_alpha_pi_2)
pairs(df, pch=19, main = 'simulated alpha')

print(cor(df, method="spearman"))
##                 true_alpha_mu_1 true_alpha_mu_2 true_alpha_pi_1
## true_alpha_mu_1      1.00000000     -0.05292706     0.032025440
## true_alpha_mu_2     -0.05292706      1.00000000    -0.068814057
## true_alpha_pi_1      0.03202544     -0.06881406     1.000000000
## true_alpha_pi_2      0.05185383     -0.03684648     0.005222261
##                 true_alpha_pi_2
## true_alpha_mu_1     0.051853828
## true_alpha_mu_2    -0.036846481
## true_alpha_pi_1     0.005222261
## true_alpha_pi_2     1.000000000
alpha = data.frame(true_alpha_mu_1 = true_alpha_mu_1,
                   true_alpha_mu_2 = true_alpha_mu_2,
                   true_alpha_pi_1 = true_alpha_pi_1,
                   true_alpha_pi_2 = true_alpha_pi_2,
                   alpha_mu_1 = alpha_mu_1,
                   alpha_mu_2 = alpha_mu_2,
                   alpha_pi_1 = alpha_pi_1,
                   alpha_pi_2 = alpha_pi_2)
alpha = melt(alpha)
alpha$status = sapply(alpha$variable, function(x) ifelse(grepl('true', x), 'simulated', 'fitted'))
alpha$variable = gsub('true_', '', alpha$variable)
ggplot(alpha, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 50, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(status ~ variable, scales = 'free_x') + xlab('true_beta')

Dispersion

par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, 1/exp(zinb@zeta), ylab = 'zinb dispersion',
     xlab = 'edgeR tagwise dispersion', main = 'Dispersion')

print(cor(disp$tagwise.dispersion, 1/exp(zinb@zeta), method="spearman"))
## [1] 0.3745392
par(mfrow = c(1, 2))
plot(density(1/exp(zinb@zeta)), main = 'zinb dispersion')
plot(density(disp$tagwise.dispersion), main = 'edgeR dispersion')

par(mfrow = c(1, 2))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, 1/exp(zinb@zeta), main = 'zinb', xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))
plot(mean, disp$tagwise.dispersion, main = 'edgeR',xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))

par(mfrow = c(1, 2))
hist(zinb@zeta, xlab = 'fitted zeta', main = '')
c(mean = mean(zinb@zeta), sd = sd(zinb@zeta))
##       mean         sd 
## -0.7551300  0.4551841
true_zeta = rnorm(1000, -.75, .45)
hist(true_zeta, breaks = 15, xlab = 'simulated zeta', main = '')

Simulate

true_alpha_mu <- matrix(c(true_alpha_mu_1, true_alpha_mu_2),
                        nrow = 2, byrow = T)
true_alpha_pi <- matrix(c(true_alpha_pi_1, true_alpha_pi_2),
                        nrow = 2, byrow = T)
true_beta_mu <- matrix(true_beta_mu, nrow = 1)
true_beta_pi <- matrix(true_beta_pi, nrow = 1)
true_gamma_mu <- matrix(true_gamma_mu, nrow = 1)
true_gamma_pi <- matrix(true_gamma_mu, nrow = 1)

sim_obj <- zinbModel(W=true_W, alpha_mu=true_alpha_mu, alpha_pi=true_alpha_pi, 
                     beta_mu=true_beta_mu, beta_pi=true_beta_pi,
                     zeta = true_zeta, gamma_mu=true_gamma_mu,
                     gamma_pi=true_gamma_pi)
sim_data <- zinbSim(sim_obj, seed=3984)

sim_data$zeroFraction
## [1] 0.92123
sum(core==0)/(nrow(core) * ncol(core))
## [1] 0.34272

Estimated mu and pi

detection_rate <- colMeans(core>0)
coverage <- colSums(core)
true_detection_rate <- rowMeans(sim_data$counts>0)
true_coverage <- rowSums(sim_data$counts)

par(mfrow=c(2, 2))
plot(rowMeans(getPi(zinb)), detection_rate,
     xlab="Average fitted Pi", ylab="Detection Rate",
     pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(zinb))), log1p(coverage), xlim = c(5, 7),
     xlab="Average fitted log Mu", ylab="log Coverage", pch=19,
     col=cols[bio])
plot(rowMeans(getPi(sim_obj)), true_detection_rate,
     xlab="Average simulated Pi", ylab="True Detection Rate",
     pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(sim_obj))), log1p(true_coverage), xlim = c(5, 7),
     xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
     col=cols[bio])

par(mfrow=c(1, 3))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
              xlab = "Mean Expression", main = 'Fitted',
              ylab = "Dropout Rate",ylim = c(0,1),
              colramp = pal)
smoothScatter(colMeans(log1p(getMu(sim_obj))), colMeans(getPi(sim_obj)),
              nrpoints=Inf, pch="", cex=.7, xlim = c(3,8), nbin = 256,
              xlab = "Mean Expression", main = 'Simulated',
              ylab = "Dropout Rate",ylim = c(0,1),
              colramp = pal)

Why it does not look good?

df <- data.frame(alpha_pi_1 = alpha_pi_1,
                 alpha_pi_2 = alpha_pi_2,
                 beta_pi = beta_pi)
pairs(df, pch=19, main = 'fitted')

print(cor(df, method="spearman"))
##             alpha_pi_1  alpha_pi_2     beta_pi
## alpha_pi_1  1.00000000 -0.02167753  0.28375927
## alpha_pi_2 -0.02167753  1.00000000 -0.08034145
## beta_pi     0.28375927 -0.08034145  1.00000000
mod <- model.matrix(~ true_W)
zinb_sim <- zinbFit(core, ncores = 3, X=mod, which_X_mu=1:3, which_X_pi=1:3, commondispersion=FALSE)
true_alpha_mu <- zinb_sim@beta_mu[-1,]
true_alpha_pi <- zinb_sim@beta_pi[-1,]
true_beta_mu <- zinb_sim@beta_mu[1,,drop=FALSE]
true_beta_pi <- zinb_sim@beta_pi[1,,drop=FALSE]
true_gamma_mu <- zinb_sim@gamma_mu
true_gamma_pi <- zinb_sim@gamma_pi

sim_obj <- zinbModel(W=true_W, alpha_mu=true_alpha_mu, alpha_pi=true_alpha_pi, 
                     beta_mu=true_beta_mu, beta_pi=true_beta_pi,
                     zeta = true_zeta, gamma_mu=true_gamma_mu,
                     gamma_pi=true_gamma_pi)
sim_data <- zinbSim(sim_obj, seed=3984)
sim_data$zeroFraction
## [1] 0.3312
sum(core==0)/(nrow(core) * ncol(core))
## [1] 0.34272
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
true_detection_rate <- rowMeans(sim_data$counts>0)
true_coverage <- rowSums(sim_data$counts)

par(mfrow=c(2, 2))
plot(rowMeans(getPi(zinb)), detection_rate,
     xlab="Average fitted Pi", ylab="Detection Rate",
     pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(zinb))), log1p(coverage), xlim = c(5, 7),
     xlab="Average fitted log Mu", ylab="log Coverage", pch=19,
     col=cols[bio])
plot(rowMeans(getPi(sim_obj)), true_detection_rate,
     xlab="Average simulated Pi", ylab="True Detection Rate",
     pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(sim_obj))), log1p(true_coverage), xlim = c(5, 7),
     xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
     col=cols[bio])

par(mfrow=c(1, 3))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
              xlab = "Mean Expression", main = 'Fitted',
              ylab = "Dropout Rate",ylim = c(0,1),
              colramp = pal)
smoothScatter(colMeans(log1p(getMu(sim_obj))), colMeans(getPi(sim_obj)),
              nrpoints=Inf, pch="", cex=.7, xlim = c(3,8), nbin = 256,
              xlab = "Mean Expression", main = 'Simulated',
              ylab = "Dropout Rate",ylim = c(0,1),
              colramp = pal)

B <- 50
sim_data <- lapply(seq_len(B), function(i) zinbSim(sim_obj, seed=i))
save(sim_obj, sim_data, file="sim_allen_k2_noCorr.rda")